/*******************************************************************************
* Objective: Create Share of construction in GDP dataset
*******************************************************************************/

* Set Stata version
version 16

* Stablish Working Directory ***************************************************
cd "$workdirectory"

* Settings *********************************************************************
capture log close
clear all
set more off

********************************************************************************
**# Step 1: Importing and combining raw datasets
********************************************************************************

* Importing Data
import delimited "raw_datasets\gdp_un.csv", clear

* Rename countries
rename countryorarea country_wup
replace country_wup = "China" if country_wup == "China, People's Republic of"
replace country_wup = "China, Macao SAR" if country_wup == "China, Macao Special Administrative Region"
replace country_wup = "Côte d'Ivoire" if country_wup == "CÃ´te d'Ivoire"
replace country_wup = "Iran (Islamic Republic of)" if country_wup == "Iran, Islamic Republic of"
replace country_wup = "Swaziland" if country_wup == "Kingdom of Eswatini"

* Expand dataset to all countries and dates by using auxiliary file
merge 1:1 country_wup year using "raw_datasets/country_extend"
drop if year <1970
drop if year >2018
keep if _merge == 2 | _merge == 3 | ///
country_wup == "Yemen: Former Democratic Yemen" | country_wup == "Yemen: Former Yemen Arab Republic" | ///
country_wup == "United Republic of Tanzania: Mainland" | country_wup == "United Republic of Tanzania: Zanzibar" | ///
country_wup == "Former Ethiopia" | country_wup == "Former Sudan"

* Keeping and Renaming variables
keep country_wup year constructionisicf totalvalueadded
rename constructionisicf construction
rename totalvalueadded total_va

********************************************************************************
**# Step 2
/*******************************************************************************
Fixing countries with discrepancies between current and historical borders 
or autonomy
*******************************************************************************/

sort country_wup year
foreach comp of varlist construction total_va{
	
	* Yemen
	replace `comp' = 0 if `comp' == . & country_wup == "Yemen"
	* We replace 1989 and 1990 in Yemen for 0 and take the data from the former countries
	replace `comp' = 0 if country_wup == "Yemen" & year == 1989
	replace `comp' = 0 if country_wup == "Yemen" & year == 1990
	* Replacing missing data in Yemen with former countries
	foreach ye of numlist 5727/5747{
		replace `comp' = `comp'[`ye'] + `comp'[`ye'+49] + `comp'[`ye'+70] if _n == `ye'
	}
	
	* Tanzania
	foreach tan of numlist 5522/5550{
		replace `comp' = `comp'[`tan'] + `comp'[`tan'+29] if _n == `tan'
	}
	
	* Sudan
	replace `comp' = 0 if `comp' == . & country_wup == "South Sudan"
	replace `comp' = 0 if `comp' == . & country_wup == "Sudan"
	
	foreach su1 of numlist 4963/5000{
		replace `comp' = `comp'[`su1'] + `comp'[`su1'-2880] if _n == `su1'
	}
	foreach su2 of numlist 5001/5011{
		replace `comp' = `comp'[`su2'] + `comp'[`su2'-98] if _n == `su2'
	}
	
}

* Ethiopia and Eritrea: We divide "Former Ethiopia" using the respective average shares on total Construction/VA of Ethiopia+Eritrea for the years 93-95
foreach eri of numlist 1912/1931{
	replace construction = construction[`eri'+147]* (0.15726) if _n == `eri'
	replace total_va = total_va[`eri'+147]* (0.06994) if _n == `eri'
}
foreach eti of numlist 1961/1980{
	replace construction = construction[`eti'+98]*(1-0.15726) if _n == `eti'
	replace total_va = total_va[`eti'+98]* (1-0.06994) if _n == `eti'
}

********************************************************************************
**# Step 3
/*******************************************************************************
Replace 2020 for 2018; clean country list and generate construction share in VA
*******************************************************************************/

* Replacing 2018 as 2020
replace year = 2020 if year == 2018

* Droping Former/unused countries:
drop if country_wup == "United Republic of Tanzania"
replace country_wup = "United Republic of Tanzania" if country_wup == "United Republic of Tanzania: Mainland"
drop if country_wup == "United Republic of Tanzania: Zanzibar"
drop if country_wup == "Yemen: Former Democratic Yemen"
drop if country_wup == "Yemen: Former Yemen Arab Republic"
drop if country_wup == "Former Ethiopia"
drop if country_wup == "Former Sudan"
drop if country_wup == "South Sudan"

* Generating shares of construction on total value added
gen constr_gdp = (construction/total_va)*100

********************************************************************************
**# Step 4: Generate moving averages
********************************************************************************

* Sorting
sort country_wup year

* Moving Average (-1/+1)

* Generating Variable ma1
gen constr_gdp_ma1 = 0

* Estimating ma1 for the sample except 1970 and 2020
foreach ma1 of numlist 2/48{
	by country_wup: replace constr_gdp_ma1 = (constr_gdp[`ma1'-1] + constr_gdp[`ma1'] + constr_gdp[`ma1'+1])/3 if _n == `ma1' 
}

* Estimating ma1 for 1970 and 2020
by country_wup: replace constr_gdp_ma1 = (constr_gdp[1] + constr_gdp[2])/2 if _n == 1
by country_wup: replace constr_gdp_ma1 = (constr_gdp[49] + constr_gdp[48])/2 if _n == 49

* Taking moving averages with a window of 2

* Generating Variable ma2
gen constr_gdp_ma2 = 0

foreach ma2 of numlist 3/47{
	by country_wup: replace constr_gdp_ma2 = ///
	(constr_gdp[`ma2'-2] + constr_gdp[`ma2'-1] + constr_gdp[`ma2'] + constr_gdp[`ma2'+1] + constr_gdp[`ma2'+2])/5 ///
	if _n == `ma2' 
}
	
* Estimating ma2 for 1970, 1971, 2017 and 2020
by country_wup: replace constr_gdp_ma2 = (constr_gdp[1] + constr_gdp[2] + constr_gdp[3])/3 if _n == 1
by country_wup: replace constr_gdp_ma2 = (constr_gdp[1] + constr_gdp[2] + constr_gdp[3] + constr_gdp[4])/4 if _n == 2
by country_wup: replace constr_gdp_ma2 = (constr_gdp[49] + constr_gdp[48] + constr_gdp[47] + constr_gdp[46])/4 if _n == 48
by country_wup: replace constr_gdp_ma2 = (constr_gdp[49] + constr_gdp[48] + constr_gdp[47])/3 if _n == 49

save "processed_datasets\constr_gdp(02.19.21).dta", replace

********************************************************************************
**# Step 5: Filling the 1960s
/*******************************************************************************
We use allternative sources to fill the share of construction on gdp for the 
years 1960 and 1965. The sources for each country are listed in the variable 
'source', the year use for either 1960 or 1965, is listed in the variable
'constr_gdp_yused'
*******************************************************************************/

* Importing
import excel "raw_datasets/constr_gdp_UN.xlsx", sheet("Sheet1") firstrow clear
tempfile constr_gdp_UN
save `constr_gdp_UN', replace

* Merging
use "processed_datasets\constr_gdp(02.19.21).dta", clear
merge 1:1 country_wup year using `constr_gdp_UN', keepusing(constr_gdp constr_gdp_yused)

********************************************************************************
**# Step 6: Finalizing Dataset
/*******************************************************************************
Keep desired variables and years; add data for Taiwan; labeling
*******************************************************************************/

* Keeping relevant variables
keep country_wup year constr_gdp constr_gdp_ma1 constr_gdp_ma2 constr_gdp_yused

* Keeping relevant years
keep if year == 1960 | year == 1965 | year == 1970 | year == 1975 | year == 1980 | year == 1985 | year == 1990 | year == 1995 ///
| year == 2000 | year == 2005 | year == 2010 | year == 2015 | year == 2020

* The MA variables are replaced by single year values pre 1970
replace constr_gdp_ma1 = constr_gdp if year <1970
replace constr_gdp_ma2 = constr_gdp if year <1970

* Fixing Taiwan (Groningen till 2010, then Taiwan official statistics)
foreach tai in constr_gdp constr_gdp_ma1 constr_gdp_ma2{
    replace `tai' = 3.85 if country_wup == "China, Taiwan Province of China" & year == 1960
	replace `tai' = 3.99 if country_wup == "China, Taiwan Province of China" & year == 1965
	replace `tai' = 3.91 if country_wup == "China, Taiwan Province of China" & year == 1970
	replace `tai' = 5.37 if country_wup == "China, Taiwan Province of China" & year == 1975
	replace `tai' = 6.34 if country_wup == "China, Taiwan Province of China" & year == 1980
	replace `tai' = 4.01 if country_wup == "China, Taiwan Province of China" & year == 1985
	replace `tai' = 4.64 if country_wup == "China, Taiwan Province of China" & year == 1990
	replace `tai' = 5.40 if country_wup == "China, Taiwan Province of China" & year == 1995
	replace `tai' = 3.38 if country_wup == "China, Taiwan Province of China" & year == 2000
	replace `tai' = 2.24 if country_wup == "China, Taiwan Province of China" & year == 2005
	replace `tai' = 2.55 if country_wup == "China, Taiwan Province of China" & year == 2010
	replace `tai' = 3.95 if country_wup == "China, Taiwan Province of China" & year == 2015
	replace `tai' = 3.98 if country_wup == "China, Taiwan Province of China" & year == 2020
}

* Adding country codes
merge m:1 country_wup using "processed_datasets/ccode", keepusing(code_wb)
keep if _merge == 3
drop _merge
rename code_wb ccode

* Labels
la var constr_gdp "Share of construction in GDP"
la var constr_gdp_ma1 "Share of construction in GDP (MA1)"
la var constr_gdp_ma2 "Share of construction in GDP (MA2)"
la var constr_gdp_yused "Year used for observations pre-1970"

* Save
save "processed_datasets\constr_gdp(02.19.21).dta", replace
